Summary

This script takes the geotagged tweets and assigns each unique user a user type, either local or tourist. If the user has self-identified their location as somewhere in the Santa Barbara area, they are designated a local. This includes Carpinteria, Santa Barbara, Montecito, Goleta, Isla Vista, Gaviota and UCSB, as well as locations that mention the 805 area code. For the remainder, we use the number of months in which they have tweeted from Santa Barbara within a year to designate user type. If someone has tweeted in more than 2 different months of the same year from Santa Barbara, they are identified as a local. This is consistent with how Eric Fischer determined tourists in his work. This is not fool-proof: there are instances where people visit and tweet from Santa Barbara in more than two months of a year, especially if they are visiting family or live within a couple of hours' driving distance.

Setup

# Global chunk options: show the code, but keep package messages and warnings
# out of the knitted document. TRUE/FALSE are spelled out because T/F are
# ordinary variables that can be reassigned.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

library(tidyverse)
library(leaflet)
library(RColorBrewer)
library(ggmap)
library(viridis)
library(sf)
library(mapview)
#remotes::install_github("wilkelab/ggtext")
library(ggtext)

# Register the Google API key for ggmap; the key is read from the
# GOOGLE_ACCESS_TOKEN environment variable so it never appears in the script.
register_google(key = Sys.getenv("GOOGLE_ACCESS_TOKEN"))

Identify tourists

TODO: think of another way to identify tourists and locals. For now, following Eric Fischer's method, we identify a local as someone who has tweeted from the Santa Barbara area in several different months of the same year. A tourist is less likely to tweet from the area across multiple months.

# Load the geotagged Santa Barbara tweets.
# NOTE(review): if user_id holds raw numeric Twitter ids, read_csv() will guess
# a double and very large ids could lose precision -- confirm against the file.
tweet_df <- read_csv("../data/geotagged_sb_tweets_post_apr_2015.csv")

# One row per user / self-reported location / year, holding the number of
# distinct calendar months in which that user tweeted during that year.
num_month_tweets <- tweet_df %>%
  group_by(user_id, user_location, Year) %>%
  summarise(num_unique_months_tweeted = n_distinct(month_num)) %>%
  ungroup()

We have 67231 tweets from 14353 people.

What we really want to do is use the number of tweets in a month/year range to tell us whether or not someone is a local or tourist. We can do this across the entire dataset, and then compare to our user location designation to see how right or wrong we were.

Our assumption is that a local's tweets will occur across multiple months within a single year, whereas a tourist is most likely to tweet only within a single month (when visiting), or on multiple visits spread across different months/years.

First, if folk self-identify as from the Santa Barbara area in their user location, we assign them locals, otherwise unknown.

Then we take the unknowns and apply two rules, evaluated per user and year:

1. If they have tweeted in more than 2 months within the same year, they are a local.
2. If they have only tweeted in 1 or 2 months within a year, they are a tourist.

# Regex alternation of Santa Barbara area place names (plus the 805 area code)
# searched for in the lower-cased, self-reported profile location.
sb_area_pattern <- paste(
  c("santa barbara", "carpinteria", "isla vista", "goleta", "montecito",
    "805", "gaviota", "ucsb"),
  collapse = "|"
)

# Provisional label: "local" when the profile location mentions the area,
# "unknown" otherwise (a missing location also counts as "unknown").
tourist_local <- num_month_tweets %>%
  group_by(user_id, user_location) %>%
  mutate(
    user_type = if_else(
      str_detect(tolower(user_location), sb_area_pattern),
      "local",
      "unknown",
      missing = "unknown"
    )
  )


# Distribution of the number of distinct months tweeted per year, split by the
# provisional user type, used to choose the local/tourist month threshold.
#
# The month count is an integer from 1 to 12, so draw one bar per value with
# geom_bar() rather than binning it: 12 histogram bins over 0-12 are about
# 1.09 wide, which misaligns them with the integers. The limits are padded by
# half a unit because ggplot2 drops any bar whose edge falls outside the scale
# limits, which would remove the 12-month bar.
user_threshold_plot <- ggplot(tourist_local) +
  geom_bar(aes(num_unique_months_tweeted)) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(1, 12, 1), limits = c(0.5, 12.5)) +
  facet_wrap(~user_type, scales = "free") +
  labs(x = "Number of months tweeted in a year",
       y = "Number of users",
       title = "How many unique months are SB geotag tweeters tweeting?")

# Display the figure in the knitted document.
user_threshold_plot

# Save exactly this plot instead of relying on "the last plot drawn".
ggsave("../figs/user_threshold.png", plot = user_threshold_plot)

Based on this, let's stick with the 3-month threshold: a user who tweets in 3 or more different months of a year is treated as a local.

# Regex alternation of Santa Barbara area place names (plus the 805 area code)
# searched for in the lower-cased, self-reported profile location.
sb_area_pattern <- paste(
  c("santa barbara", "carpinteria", "isla vista", "goleta", "montecito",
    "805", "gaviota", "ucsb"),
  collapse = "|"
)

# Label every user / location / year row, then collapse to the distinct
# (user_id, user_location, user_type) combinations. The rules are applied in
# order, first match wins:
#   1. the profile location mentions the Santa Barbara area -> "local"
#      (a missing location never matches and falls through to rules 2 and 3);
#   2. tweeted in MORE than 2 distinct months of the year   -> "local";
#   3. otherwise (1 or 2 months)                            -> "tourist".
# No group_by() is needed: the rules are evaluated row by row, and leaving the
# result ungrouped avoids surprising grouped behaviour further down.
tourist_local <- num_month_tweets %>%
  mutate(
    user_type = case_when(
      str_detect(tolower(user_location), sb_area_pattern) ~ "local",
      num_unique_months_tweeted > 2 ~ "local",
      TRUE ~ "tourist"
    )
  ) %>%
  select(-num_unique_months_tweeted, -Year) %>%
  distinct()

Because we assigned “tourist” or “local” by year, there are some cases where a user is labelled a tourist in one year and a local in another. For those users, we will default to calling them local in every year.

# Users that still have more than one row in tourist_local. A repeated user_id
# means the user was labelled differently in different years and/or reported
# more than one profile location. Kept for inspection only.
dups <- tourist_local$user_id[duplicated(tourist_local$user_id)]

# Users labelled "local" in at least one of their rows.
local_user_ids <- unique(
  tourist_local$user_id[tourist_local$user_type == "local"]
)

# Resolve conflicting labels: anyone who was ever labelled local becomes local
# everywhere. A repeated user_id alone is not enough to promote a user, because
# someone who only changed their profile location but was a tourist in every
# year must stay a tourist.
# NOTE: tweet counts quoted in the text were produced with the earlier
# "any repeated user_id is local" rule and should be recomputed.
tourist_local_update <- tourist_local %>%
  ungroup() %>%
  mutate(
    user_type = if_else(user_id %in% local_user_ids, "local", user_type)
  ) %>%
  distinct()

# Attach the user type to every tweet. The join keys are spelled out so the
# join cannot silently change if either table gains a shared column.
tweet_df_w_user_type <- tweet_df %>%
  left_join(tourist_local_update, by = c("user_id", "user_location"))

There are 21811 tweets from tourists and 45420 tweets from locals.

Save

# Persist the tweets with their user_type column for downstream scripts.
tweet_df_w_user_type %>%
  write_csv("../data/geotag_sb_tweets_user_type.csv")

Maps

Turn the tweet_df_w_user_type data frame into a spatial object.

# Build point geometries from the lon/lat columns in WGS84 (EPSG:4326).
# remove = FALSE keeps lon and lat as ordinary columns, which the static map
# below still uses.
tweet_df_w_user_type_sf <- st_as_sf(
  tweet_df_w_user_type,
  coords = c("lon", "lat"),
  crs = 4326,
  remove = FALSE
)

Static map

# Keep only the columns the static map needs.
static_df <- tweet_df_w_user_type_sf %>%
  select(user_type, geometry, lat, lon)

# Basemap for Santa Barbara.
sb.map <- get_map("santa barbara, california", zoom = 14, maptype = "toner-lite")

# Tweets over the basemap, coloured by user type. The legend is hidden; the
# title carries the colour key instead, so the colours are bound to the
# user_type values by name to guarantee that red is always "local" and blue is
# always "tourist", regardless of the order of the levels in the data.
ggmap(sb.map, legend = "none") +
  coord_equal() +
  geom_point(
    data = static_df,
    aes(x = lon, y = lat, color = user_type),
    size = 0.5,
    alpha = 0.3
  ) +
  scale_color_manual(values = c(local = "red", tourist = "blue")) +
  labs(
    x = NULL,
    y = NULL,
    color = "User type",
    title = "Santa Barbara tweets from <b style='color:#FF0000'>locals</b> and <b style='color:#0000FF'>tourists</b>",
    subtitle = "2015 - 2019"
  ) +
  theme(
    axis.text = element_blank(),
    plot.title = element_markdown(lineheight = 1.1),
    plot.subtitle = element_markdown(lineheight = 1.1),
    legend.position = "none"
  )

Hex density maps

Interactive hex density

Get hex density by overlaying with points

# Hexagon grid, reprojected to the CRS of the tweet points so the two layers
# can be intersected.
hex_grid <- st_transform(
  read_sf("../data/sb_area_hexagons.shp"),
  st_crs(tweet_df_w_user_type_sf)
)

# Tweet points split by user type.
locals <- filter(tweet_df_w_user_type_sf, user_type == "local")
tourists <- filter(tweet_df_w_user_type_sf, user_type == "tourist")

# Per-hexagon tweet counts for each user type, their natural logs, the combined
# total and the local-minus-tourist difference. Hexagons with a count of zero
# get a log of -Inf.
hex_tweet_count <- hex_grid %>%
  mutate(
    local_tweet_count = lengths(st_intersects(hex_grid, locals)),
    local_log_tweet_count = log(local_tweet_count),
    tourist_tweet_count = lengths(st_intersects(hex_grid, tourists)),
    tourist_log_tweet_count = log(tourist_tweet_count),
    total = local_tweet_count + tourist_tweet_count,
    diff = local_tweet_count - tourist_tweet_count
  )

# Sequential palettes: blues for locals, reds for tourists.
blues <- colorRampPalette(c("#DEEBF7", "#08306B"))
reds <- colorRampPalette(c("#FEE0D2", "#67000D"))

# Two overlaid interactive layers of log tweet counts. Empty hexagons are
# filtered out of each layer before plotting.
mapview(
  filter(hex_tweet_count, local_tweet_count > 0),
  zcol = "local_log_tweet_count",
  col.regions = blues,
  alpha.regions = 0.5,
  layer.name = "# tweets by locals (log)"
) +
  mapview(
    filter(hex_tweet_count, tourist_tweet_count > 0),
    zcol = "tourist_log_tweet_count",
    col.regions = reds,
    alpha.regions = 0.5,
    layer.name = "# tweets by tourists (log)"
  )

Difference map

We want to know where tourists go that locals do not and vice versa.

# Hexagons where tourists out-tweet locals: flip the sign of the (negative)
# difference so it can be put on a log10 scale.
t <- hex_tweet_count %>%
  filter(diff < 0) %>%
  mutate(value = -1 * diff) %>%
  mutate(log_value = log10(value))

# Hexagons where locals out-tweet tourists.
l <- hex_tweet_count %>%
  filter(diff > 0) %>%
  mutate(log_value = log10(diff))

# Interactive difference map: purple hexagons are tourist-preferred, orange
# hexagons are local-preferred; shading follows the log10 of the difference.
tl_map <- mapview(
  t,
  zcol = "log_value",
  layer.name = "Tourist preferred # tweets",
  col.regions = colorRampPalette(brewer.pal(9, "Purples")[4:9]),
  alpha.regions = 1
) +
  mapview(
    l,
    zcol = "log_value",
    layer.name = "Local preferred # tweets",
    col.regions = colorRampPalette(brewer.pal(9, "Oranges")[4:9]),
    alpha.regions = 1
  )

# Open the underlying leaflet map at a fixed centre and zoom level.
setView(tl_map@map, lng = -119.714, lat = 34.426, zoom = 13)